Objectives: The goal of this kernel is to analyze the effects and flavors of three different types of cannabis.
The EDA uses datatable, skim, and plotly; xgboost with histogram is used as the model for this analysis.
If you have any questions, please leave a comment, and if you like the kernel, please give it an upvote. Thanks!
library(tidyverse)
library(skimr)
library(highcharter)
# library(qdap)
library(tm)
library(plotly)
library(viridis)
library(wordcloud)
library(plotrix)
library(DescTools)
library(DescTools)
library(DT)
# Read the strain data set; the path is relative to the working directory.
weed <- read_csv("input/cannabis.csv")

# Interactive table of the raw rows with per-column filter boxes on top.
datatable(
  weed,
  filter = "top",
  options = list(pageLength = 15, autoWidth = TRUE)
)

# Per-column summary statistics rendered as a table.
# NOTE(review): no library(knitr) is visible in this file, so kable() is
# presumably supplied by skimr or by a package attached elsewhere -- confirm.
weed %>%
  skim() %>%
  kable()
## Skim summary statistics
## n obs: 2351
## n variables: 6
##
## Variable type: character
##
## variable missing complete n min max empty n_unique
## ------------ -------- --------- ----- ---- ----- ------ ---------
## Description 33 2318 2351 4 1120 0 2312
## Effects 0 2351 2351 4 46 0 1655
## Flavor 46 2305 2351 3 30 0 1293
## Strain 0 2351 2351 2 30 0 2350
## Type 0 2351 2351 6 6 0 3
##
## Variable type: numeric
##
## variable missing complete n mean sd p0 p25 p50 p75 p100 hist
## --------- -------- --------- ----- ----- ----- --- ---- ---- ---- ----- ---------
## Rating 0 2351 2351 4.31 0.84 0 4.2 4.4 4.7 5 ▁▁▁▁▁▁▅▇
# Compact column-by-column preview (name, type, first values) of the data.
glimpse(weed)
## Observations: 2,351
## Variables: 6
## $ Strain <chr> "100-Og", "98-White-Widow", "1024", "13-Dawgs", "2...
## $ Type <chr> "hybrid", "hybrid", "sativa", "hybrid", "hybrid", ...
## $ Rating <dbl> 4.0, 4.7, 4.4, 4.2, 4.6, 0.0, 4.4, 4.2, 4.6, 4.4, ...
## $ Effects <chr> "Creative,Energetic,Tingly,Euphoric,Relaxed", "Rel...
## $ Flavor <chr> "Earthy,Sweet,Citrus", "Flowery,Violet,Diesel", "S...
## $ Description <chr> "$100 OG is a 50/50 hybrid strain that packs a str...
# Treemap of the number of strains per cannabis type; tile size and colour
# are both driven by the count column `n`.
by_type <- count(weed, Type)
hchart(
  by_type,
  type = "treemap",
  hcaes(x = "Type", value = "n", color = "n")
)
# The counts are only needed for this chart.
rm(by_type)
# Long format: the comma-separated Effects string is split so that every
# (strain, effect) pair gets its own row.
weed_effects <- mutate(weed, Effects = str_split(Effects, ","))
weed_effects <- unnest(weed_effects, Effects)

# Treemap of how often each individual effect is reported.
hchart(
  count(weed_effects, Effects),
  type = "treemap",
  hcaes(x = "Effects", value = "n", color = "n")
)
# Long format: one row per (strain, flavor) pair.
# The comparison is case-sensitive (only the exact string 'none' is excluded)
# and filter() also drops rows whose Flavor is NA, because `NA != 'none'`
# evaluates to NA rather than TRUE.
weed_flavor <- filter(weed, Flavor != "none")
weed_flavor <- mutate(weed_flavor, Flavor = str_split(Flavor, ","))
weed_flavor <- unnest(weed_flavor, Flavor)

# Treemap of how often each individual flavor is reported.
hchart(
  count(weed_flavor, Flavor),
  type = "treemap",
  hcaes(x = "Flavor", value = "n", color = "n")
)
# Drill-down bar chart: number of strains per type, drilling into the
# effect counts of the clicked type.

# Top level: one bar per cannabis type. `drilldown` carries the id of the
# series that is opened when the bar is clicked.
df1 <- weed %>%
  group_by(name = Type, drilldown = Type) %>%
  summarise(y = n()) %>%
  ungroup() %>%
  arrange(desc(y))

# Drill-down level: one series per type (its `id` matches `drilldown` in df1)
# whose `data` list-column holds one point per effect, sorted by count.
# The former per-(Type, Effects) `y = n()` / arrange() step was removed: its
# `y` was overwritten by the summarise() below and its ordering was discarded
# by the regrouping, so it had no influence on the result.
# Each point also gets a lower-cased "type: effect" drilldown id; no series
# with such an id is defined in this block.
df2 <- weed_effects %>%
  mutate(colorByPoint = 1) %>%
  group_by(name = Type, id = Type, colorByPoint) %>%
  do(data = list_parse(
    mutate(
      .,
      name = Effects,
      drilldown = tolower(paste(Type, Effects, sep = ": "))
    ) %>%
      group_by(name, drilldown) %>%
      summarise(y = n()) %>%
      ungroup() %>%
      select(name, y, drilldown) %>%
      arrange(desc(y))
  ))

# The surrounding parentheses print the chart in addition to assigning it.
(a <- highchart() %>%
  hc_chart(type = "bar") %>%
  hc_xAxis(type = "category") %>%
  hc_add_series(name = "number of cannabis", data = df1, colorByPoint = 1) %>%
  hc_drilldown(
    allowPointDrilldown = TRUE,
    series = list_parse(df2)
  ) %>%
  hc_legend(enabled = FALSE) %>%
  hc_title(text = "Type of Cannabis vs Effects") %>%
  hc_add_theme(hc_theme_darkunica()))

# The helper tables are no longer needed once the chart object exists.
rm(df1, df2)
# Drill-down bar chart: number of strains per type, drilling into the
# flavor counts of the clicked type.

# Top level: one bar per cannabis type. `drilldown` carries the id of the
# series that is opened when the bar is clicked.
df1 <- weed %>%
  group_by(name = Type, drilldown = Type) %>%
  summarise(y = n()) %>%
  ungroup() %>%
  arrange(desc(y))

# Drill-down level: one series per type (its `id` matches `drilldown` in df1)
# whose `data` list-column holds one point per flavor, sorted by count.
# The former per-(Type, Flavor) `y = n()` / arrange() step was removed: its
# `y` was overwritten by the summarise() below and its ordering was discarded
# by the regrouping, so it had no influence on the result.
# Each point also gets a lower-cased "type: flavor" drilldown id; no series
# with such an id is defined in this block.
df2 <- weed_flavor %>%
  mutate(colorByPoint = 1) %>%
  group_by(name = Type, id = Type, colorByPoint) %>%
  do(data = list_parse(
    mutate(
      .,
      name = Flavor,
      drilldown = tolower(paste(Type, Flavor, sep = ": "))
    ) %>%
      group_by(name, drilldown) %>%
      summarise(y = n()) %>%
      ungroup() %>%
      select(name, y, drilldown) %>%
      arrange(desc(y))
  ))

# Assigned without printing; the chart object is kept in `b`.
b <- highchart() %>%
  hc_chart(type = "bar") %>%
  hc_xAxis(type = "category") %>%
  hc_add_series(name = "number of cannabis", data = df1, colorByPoint = 1) %>%
  hc_drilldown(
    allowPointDrilldown = TRUE,
    series = list_parse(df2)
  ) %>%
  hc_legend(enabled = FALSE) %>%
  hc_title(text = "Type of Cannabis vs Flavor") %>%
  hc_add_theme(hc_theme_darkunica())

# The helper tables are no longer needed once the chart object exists.
rm(df1, df2)
# Lay out the two drill-down charts in one grid (row height 400), then
# drop the chart objects and the temporary list.
lst <- list(a, b)
lst %>%
  hw_grid(rowheight = 400)
rm(list = c("a", "b", "lst"))
# Normalise a tm corpus for term counting.
#
# Steps, in this order: strip punctuation, collapse whitespace, lower-case,
# remove stop words, remove digits.
#
# corpus: a tm corpus (e.g. built with Corpus(VectorSource(...))).
# Returns the transformed corpus.
cleanCorpus <- function(corpus) {
  # Punctuation is removed before the stop-word pass, so contractions reach it
  # without apostrophes; these apostrophe-free spellings are removed on top of
  # the standard English list (presumably for that reason).
  extra_stopwords <- c(
    "thats", "weve", "hes", "theres", "ive", "im",
    "will", "can", "cant", "dont", "youve", "us",
    "youre", "youll", "theyre", "whats", "didnt"
  )
  words_to_drop <- c(stopwords("en"), extra_stopwords)

  corpus %>%
    tm_map(removePunctuation) %>%
    tm_map(stripWhitespace) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, words_to_drop) %>%
    tm_map(removeNumbers)
}
# Term frequencies over a character vector of documents.
#
# text: character vector, one element per document.
# Returns a data frame with columns `word` and `freq` (total occurrences over
# all documents), sorted by decreasing frequency.
frequentTerms <- function(text) {
  term_matrix <- text %>%
    VectorSource() %>%
    Corpus() %>%
    cleanCorpus() %>%
    TermDocumentMatrix() %>%
    # Keep only terms whose sparsity is below 0.999.
    removeSparseTerms(0.999) %>%
    as.matrix()

  # Rows are terms, so the row sums are the corpus-wide counts.
  totals <- sort(rowSums(term_matrix), decreasing = TRUE)
  data.frame(word = names(totals), freq = totals)
}
# Build a term-by-type count matrix from the strain descriptions.
#
# All descriptions of one cannabis type are concatenated into a single
# document, the documents are cleaned with cleanCorpus(), and a
# term-document matrix with one column per type is produced.
#
# dataset: data frame with the columns `Type` and `Description`.
# n_types: number of most frequent types to include (default 3, matching the
#          previously hard-coded value). If fewer types exist, all of them
#          are used.
#
# Side effect: the full matrix is stored as `all_clean` in the global
# environment; later parts of this script read it from there.
# Returns the first rows (head()) of that matrix.
clean_top_char <- function(dataset, n_types = 3) {
  # Most frequent types first; computed once instead of once per type.
  # Rows without a type are ignored so they cannot become a column.
  top_types <- dataset %>%
    filter(!is.na(Type)) %>%
    count(Type) %>%
    arrange(desc(n)) %>%
    head(n_types) %>%
    pull(Type) %>%
    as.character()

  if (length(top_types) == 0) {
    stop("`dataset` contains no cannabis types to summarise.", call. = FALSE)
  }

  # One document per type. Missing descriptions are dropped first, because
  # paste() would otherwise turn them into the literal text "NA".
  type_documents <- vapply(
    top_types,
    function(type) {
      descriptions <- dataset$Description[dataset$Type %in% type]
      paste(descriptions[!is.na(descriptions)], collapse = " ")
    },
    character(1),
    USE.NAMES = FALSE
  )

  all_clean <- type_documents %>%
    VectorSource() %>%
    Corpus() %>%
    cleanCorpus() %>%
    TermDocumentMatrix() %>%
    as.matrix()
  colnames(all_clean) <- top_types

  # Kept for backward compatibility: downstream script code uses the global.
  assign("all_clean", all_clean, envir = .GlobalEnv)
  head(all_clean)
}
# Prints the first rows of the term-by-type matrix; the call also stores the
# full matrix as the global `all_clean`, which later chunks rely on.
clean_top_char(weed)
## Docs
## Terms hybrid indica sativa
## abandon 1 0 0
## abate 8 5 5
## abates 1 1 0
## abating 3 1 0
## abbreviated 1 0 0
## abduct 1 0 0
# Bar chart of the 30 most frequent description terms, bars ordered by
# decreasing frequency and coloured per word.
frequentTerms(weed$Description) %>%
  head(30) %>%
  mutate(word = factor(word)) %>%
  plot_ly(
    x = ~reorder(word, -freq),
    y = ~freq,
    colors = viridis(10)
  ) %>%
  add_bars(color = ~word) %>%
  layout(
    title = "Top 30 Words",
    yaxis = list(title = " "),
    xaxis = list(title = ""),
    margin = list(l = 100)
  )
# Word clouds over the sativa and indica columns of the global term matrix
# `all_clean`: first the terms the two types share, then the terms that
# distinguish them.
commonality.cloud(
  all_clean[, c("sativa", "indica")],
  colors = "steelblue1",
  at.least = 2,
  max.words = 100
)
comparison.cloud(
  all_clean[, c("sativa", "indica")],
  colors = c("#F8766D", "#00BFC4"),
  max.words = 50
)
# Terms that occur in both the sativa and the indica descriptions, ranked by
# the absolute difference between their two counts. The terms end up in the
# default `rowname` column created by rownames_to_column().
common_words <- rownames_to_column(as.data.frame(all_clean))
common_words <- common_words %>%
  filter(sativa > 0 & indica > 0) %>%
  mutate(difference = abs(sativa - indica)) %>%
  arrange(desc(difference))

# Only the 25 terms with the largest difference are plotted.
common_words_25 <- head(common_words, 25)

# Back-to-back bars: sativa counts on the left, indica counts on the right.
pyramid.plot(
  common_words_25$sativa,
  common_words_25$indica,
  labels = common_words_25$rowname,
  gap = 200,
  top.labels = c("sativa", "Words", "indica"),
  main = "Words in Common",
  laxlab = NULL,
  raxlab = NULL,
  unit = NULL
)
## [1] 5.1 4.1 4.1 2.1
rm(common_words, common_words_25)
# Lower-cased distinct effect names, used to pick effect words out of the
# description vocabulary; the long effects table is not needed afterwards.
effects <- tolower(unique(weed_effects$Effects))
rm(weed_effects)

# Rows of the term-by-type matrix whose term is one of the effect names.
effectByType <- rownames_to_column(as.data.frame(all_clean), "word")
effectByType <- effectByType %>%
  filter(word %in% effects) %>%
  mutate(word = factor(word))

# 3D scatter: each effect word is positioned by its count in the hybrid (x),
# sativa (y) and indica (z) descriptions; the hover text repeats the counts.
effectByType %>%
  plot_ly(
    x = ~hybrid,
    y = ~sativa,
    z = ~indica,
    color = ~word,
    hoverinfo = "text",
    colors = viridis(15),
    text = ~paste(
      "Effects:", word,
      "<br>hybrid:", hybrid,
      "<br>sativa:", sativa,
      "<br>indica:", indica
    )
  ) %>%
  add_markers(opacity = 0.8) %>%
  layout(
    title = "Effects by Different Cannabis",
    annotations = list(
      yref = "paper",
      xref = "paper",
      y = 1.05,
      x = 1.1,
      text = "Effects",
      showarrow = FALSE
    ),
    scene = list(
      xaxis = list(title = "hybrid"),
      yaxis = list(title = "sativa"),
      zaxis = list(title = "indica")
    )
  )
Hope you enjoyed the kernel and don’t forget to upvote~ Thanks a lot!